# Importing the libraries
import numpy as np
import pandas as pd
from numpy import math
import seaborn as sns
import matplotlib.pyplot as plt


# Load the TV dataset from a CSV file located in the working directory
df = pd.read_csv("data_TV.csv")
# First 5 rows
df.head()


# Last 5 rows
df.tail()


# Checking the shape of the dataframe: (rows, columns)
df.shape

(2617, 8)


# Column names
df.columns

Index(['first_air_date', 'origin_country', 'original_language', 'name',
       'popularity', 'vote_average', 'vote_count', 'overview'],
      dtype='object')


# Checking the data type of each column
df.dtypes

first_air_date        object
origin_country        object
original_language     object
name                  object
popularity           float64
vote_average         float64
vote_count             int64
overview              object
dtype: object


# Checking null values: number of missing entries in each column
df.isnull().sum()

first_air_date        6
origin_country        0
original_language     0
name                  0
popularity            0
vote_average          0
vote_count            0
overview             65
dtype: int64


# Total number of null values across the whole dataframe
df.isnull().sum().sum()

71


# Drop every row that contains at least one missing value.
# NOTE: dropna() keeps the original row labels, so the index is no longer
# contiguous afterwards; row labels (.loc) and row positions (.iloc) differ
# from here on.
df = df.dropna()


# Shape after dropping the incomplete rows
df.shape

(2548, 8)


# Confirm that no null values remain after dropna()
df.isnull().sum()

first_air_date       0
origin_country       0
original_language    0
name                 0
popularity           0
vote_average         0
vote_count           0
overview             0
dtype: int64


# Same check through isna(), which pandas provides as an alias of isnull()
df.isna().sum()

first_air_date       0
origin_country       0
original_language    0
name                 0
popularity           0
vote_average         0
vote_count           0
overview             0
dtype: int64


# Information about the dataset: index range, per-column non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2548 entries, 0 to 2616
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   first_air_date     2548 non-null   object 
 1   origin_country     2548 non-null   object 
 2   original_language  2548 non-null   object 
 3   name               2548 non-null   object 
 4   popularity         2548 non-null   float64
 5   vote_average       2548 non-null   float64
 6   vote_count         2548 non-null   int64  
 7   overview           2548 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 179.2+ KB


# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()


import seaborn as sns # for making visualizations
from matplotlib import pyplot as plt # for making visualizations


# Number of rows for each original language, most frequent first
df['original_language'].value_counts()

en    1682
ja     395
es     246
ko      99
fr      18
tr      18
pt      17
de      13
it      10
zh      10
sv       6
da       6
no       5
ru       5
hi       4
th       2
ca       2
he       2
pl       2
is       2
ar       2
tl       1
nl       1
Name: original_language, dtype: int64


# Countplot of the number of rows per original language.
# The column is passed as the keyword argument `x`: from seaborn 0.12 on,
# `data` is the only valid positional argument, so a positional Series would
# raise an error or be misinterpreted.
sns.countplot(x=df['original_language'])

C:\Users\dnbha\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='original_language', ylabel='count'>


# Number of rows for each distinct average-vote value, most frequent first
df['vote_average'].value_counts()

7.6    190
8.0    170
7.9    157
7.5    156
7.8    153
8.1    152
8.2    142
7.7    141
8.3    136
7.4    131
7.3    122
7.1     99
7.2     99
8.4     90
8.5     90
8.6     89
7.0     76
8.7     75
6.9     70
6.8     51
6.7     35
6.6     28
6.5     18
6.4     15
6.3     14
6.2      8
6.1      7
5.8      7
6.0      5
5.6      4
5.9      4
5.5      3
5.7      3
4.9      1
3.9      1
4.1      1
9.0      1
5.2      1
5.3      1
8.8      1
0.6      1
Name: vote_average, dtype: int64


# Countplot of the number of rows per average-vote value.
# A wide figure is used so the many distinct x tick labels stay readable.
plt.figure(figsize=(23,10))
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only valid
# positional argument.
sns.countplot(x=df['vote_average'])

C:\Users\dnbha\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='vote_average', ylabel='count'>


# Countplot of the number of rows per origin country, drawn on a wide figure.
plt.figure(figsize=(23,10))
# `x` is passed by keyword: from seaborn 0.12 on, `data` is the only valid
# positional argument.
sns.countplot(x=df['origin_country'])

C:\Users\dnbha\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='origin_country', ylabel='count'>


from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Change categorical fields to numerical fields.
# fit_transform() re-fits the encoder on every call, so applying it column by
# column gives each object column its own independent integer coding.
label_encoder = LabelEncoder()

# Split the frame into object (string) columns and the remaining numeric ones
df_train_obj = df.select_dtypes("object")
df_train_nonobj = df.select_dtypes(exclude=['object'])

# Use the encoder created above instead of instantiating a second, anonymous one
df_train_obj = df_train_obj.apply(label_encoder.fit_transform)

# mash together: encoded object columns first, then the numeric columns
df_train = pd.concat([df_train_obj, df_train_nonobj], axis=1)

# Project the encoded table onto its first three principal components.
# NOTE(review): the columns are not standardized before PCA, so columns with
# large numeric ranges may dominate the components — confirm this is intended.
pca = PCA(n_components=3, random_state=7)
pca_mdl = pca.fit_transform(df_train)
# Wrapping the array in a DataFrame gives a fresh 0..n-1 index and the integer
# column names 0, 1 and 2.
df_pca = pd.DataFrame(pca_mdl)


# imports
from sklearn.cluster import KMeans

# Elbow method on the PCA projection: fit KMeans once per candidate cluster
# count and record the inertia_ of each fit.
k_values = range(1, 11)
distortions = []

for k in k_values:
    km = KMeans(
        n_clusters=k,
        init='random',
        n_init=10,
        max_iter=300,
        tol=1e-04,
        random_state=0,
    )
    distortions.append(km.fit(df_pca).inertia_)


# Plot the distortion against the number of clusters
plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()


# Final model on the PCA projection: KMeans with 4 clusters
km = KMeans(
    n_clusters=4,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
# One cluster label per row of df_pca
y_km = km.fit_predict(df_pca)


# 3-D scatter of the three principal components, coloured by cluster label
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')

# Columns 0, 1 and 2 of df_pca are the three PCA components
x, y, z = (df_pca[component] for component in range(3))

ax.scatter(x, y, z, c=y_km)

ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.set_zlabel("PCA 3")

plt.show()


# Elbow method on the full encoded feature table (no PCA): fit KMeans once per
# candidate cluster count and record the inertia_ of each fit.
k_values = range(1, 11)
distortions = []

for k in k_values:
    km = KMeans(
        n_clusters=k,
        init='random',
        n_init=10,
        max_iter=300,
        tol=1e-04,
        random_state=0,
    )
    distortions.append(km.fit(df_train).inertia_)


# Plot the distortion against the number of clusters
plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()


# Final model on the full encoded feature table: KMeans with 4 clusters
km = KMeans(
    n_clusters=4,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
# One cluster label per row of df_train, in row order
y_km = km.fit_predict(df_train)


# Count of rows per (label-encoded) first_air_date value, split by cluster
sns.countplot(data=df_train, x="first_air_date", hue=y_km)
plt.xticks(rotation=90)
plt.gcf().set_size_inches(5, 5)


# Grab a random entry from our data set
# (sample() with no arguments returns a one-row DataFrame)
sample = df_train.sample()

# rerun to generate different samples
sample


# Shape of the encoded feature table
df_train.shape

(2548, 8)


# Find out what cluster the sampled row belongs to.
# y_km is a plain array with one entry per row of df_train in row order, so it
# must be indexed by row POSITION. sample.index holds the row LABEL, which
# differs from the position once dropna() has left gaps in the index, so the
# label is translated to a position first.
sample_pos = df_train.index.get_loc(sample.index[0])
label = y_km[sample_pos]

# Pick a random show from the same cluster.
# np.where returns positions into y_km, hence .iloc rather than .loc.
cluster_positions = np.where(y_km == label)[0]
recommend_show = df_train.iloc[np.random.choice(cluster_positions)]

# Display our selection.
# recommend_show.name is the row label shared by df and df_train, so the
# original (un-encoded) row is looked up by label with .loc.
df.loc[recommend_show.name]

first_air_date                                              2017-03-19
origin_country                                                      CA
original_language                                                   en
name                                                    Anne with an E
popularity                                                     117.989
vote_average                                                       8.7
vote_count                                                        4064
overview             A coming-of-age story about an outsider who, a...
Name: 6, dtype: object

	first_air_date	origin_country	original_language	name	popularity	vote_average	vote_count	overview
0	2021-09-03	US	en	The D'Amelio Show	30.104	9.0	3071	From relative obscurity and a seemingly normal...
1	2008-01-20	US	en	Breaking Bad	468.253	8.8	10131	When Walter White, a New Mexico chemistry teac...
2	2021-11-06	US	en	Arcane	95.667	8.7	2615	Amid the stark discord of twin cities Piltover...
3	2013-12-02	US	en	Rick and Morty	1511.996	8.7	7220	Rick is a mentally-unbalanced but scientifical...
4	2022-04-14	US	en	The Kardashians	195.038	8.7	1627	The family you know and love is here with a br...

	first_air_date	origin_country	original_language	name	popularity	vote_average	vote_count	overview
2612	2002-06-11	US	en	American Idol	34.052	5.2	135	Each year, hopeful singers from all over the c...
2613	2000-07-05	US	en	Big Brother	47.029	4.9	190	American version of the reality game show whic...
2614	1997-03-31	GB	en	Teletubbies	36.875	4.1	108	Pre-school fun, fantasy and education with col...
2615	1985-02-19	GB	en	EastEnders	108.720	3.9	183	The everyday lives of working-class residents ...
2616	2006-10-09	CA	fr	La Job	6.968	0.6	162	La Job is a French Canadian comedy television ...

	popularity	vote_average	vote_count
count	2548.000000	2548.000000	2548.000000
mean	60.672048	7.691915	613.074961
std	225.264021	0.621754	1237.841064
min	0.866000	0.600000	99.000000
25%	16.767500	7.300000	151.000000
50%	27.756500	7.700000	259.500000
75%	50.548500	8.100000	573.000000
max	6684.611000	9.000000	19459.000000

Project 4

Importing Data

Loading Data

Pre-processing Data

Checking data types

Checking Null Values

Checking Missing Values

Checking the dataset info

Visualizations

Modeling

KMeans

Modeling